{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction: Data Creation\n",
    "\n",
    "In this notebook we create an example dataset to be used for automated feature engineering. I have included this code in the repository for posterity and because at some point it may come in use for generating additional example datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "TMJOuoK-3TwZ"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from datetime import datetime\n",
    "import random\n",
    "\n",
    "rand_dates = []\n",
    " \n",
    "for _ in range(1000):\n",
    "  \n",
    "  year = random.choice(range(2000, 2015))\n",
    "  month = random.choice(range(1, 13))\n",
    "  day = random.choice(range(1, 29))\n",
    "  rdate = datetime(year, month, day)\n",
    "  rand_dates.append(rdate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "ZHbTWJ4W8aRO"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>client_id</th>\n",
       "      <th>joined</th>\n",
       "      <th>income</th>\n",
       "      <th>credit_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>46109</td>\n",
       "      <td>2002-04-16</td>\n",
       "      <td>172677</td>\n",
       "      <td>527</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>49545</td>\n",
       "      <td>2007-11-14</td>\n",
       "      <td>104564</td>\n",
       "      <td>770</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>41480</td>\n",
       "      <td>2013-03-11</td>\n",
       "      <td>122607</td>\n",
       "      <td>585</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>46180</td>\n",
       "      <td>2001-11-06</td>\n",
       "      <td>43851</td>\n",
       "      <td>562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>25707</td>\n",
       "      <td>2006-10-06</td>\n",
       "      <td>211422</td>\n",
       "      <td>621</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  client_id     joined  income credit_score\n",
       "0     46109 2002-04-16  172677          527\n",
       "1     49545 2007-11-14  104564          770\n",
       "2     41480 2013-03-11  122607          585\n",
       "3     46180 2001-11-06   43851          562\n",
       "4     25707 2006-10-06  211422          621"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clients = pd.DataFrame(columns = ['client_id', 'joined', 'income', 'credit_score'])\n",
    "for _ in range(25):\n",
    "  clients = clients.append(pd.DataFrame({'client_id': np.random.randint(25000, 50000, size = 1)[0], 'joined': random.choice(rand_dates),\n",
    "                           'income': np.random.randint(30500, 240000, size = 1)[0], 'credit_score': np.random.randint(500, 850, size = 1)[0]},\n",
    "                                        index = [0]), ignore_index = True)\n",
    "\n",
    "clients.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "X7KOWMe19qBe"
   },
   "outputs": [],
   "source": [
    "loans = pd.DataFrame(columns = ['client_id', 'loan_type', 'loan_amount', 'repaid',\n",
    "                                         'loan_id', 'loan_start', 'loan_end', 'rate'])\n",
    "\n",
    "for client in clients['client_id'].unique():\n",
    "  for _ in range(20):\n",
    "    time_created = pd.datetime(np.random.randint(2000, 2015, size = 1)[0],\n",
    "                               np.random.randint(1, 13, size = 1)[0],\n",
    "                               np.random.randint(1, 30, size = 1)[0])\n",
    "\n",
    "    time_ended = time_created + pd.Timedelta(days = np.random.randint(500, 1000, size = 1)[0])\n",
    "\n",
    "    loans = loans.append(pd.DataFrame({'client_id': client, 'loan_type': random.choice(['cash', 'credit', 'home', 'other']),\n",
    "                                                         'loan_amount': np.random.randint(500, 15000, size = 1)[0],\n",
    "                                                         'repaid': random.choice([0, 1]), \n",
    "                                                         'loan_id': np.random.randint(10000, 12000, size = 1)[0],\n",
    "                                                         'loan_start': time_created,\n",
    "                                                         'loan_end': time_ended,\n",
    "                                                          'rate': round(abs(4 * np.random.randn(1)[0]), 2)}, index = [0]), ignore_index = True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "S691c7oN_uVh"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>client_id</th>\n",
       "      <th>loan_type</th>\n",
       "      <th>loan_amount</th>\n",
       "      <th>repaid</th>\n",
       "      <th>loan_id</th>\n",
       "      <th>loan_start</th>\n",
       "      <th>loan_end</th>\n",
       "      <th>rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>46109</td>\n",
       "      <td>home</td>\n",
       "      <td>13672</td>\n",
       "      <td>0</td>\n",
       "      <td>10243</td>\n",
       "      <td>2002-04-16</td>\n",
       "      <td>2003-12-20</td>\n",
       "      <td>2.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>46109</td>\n",
       "      <td>credit</td>\n",
       "      <td>9794</td>\n",
       "      <td>0</td>\n",
       "      <td>10984</td>\n",
       "      <td>2003-10-21</td>\n",
       "      <td>2005-07-17</td>\n",
       "      <td>1.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>46109</td>\n",
       "      <td>home</td>\n",
       "      <td>12734</td>\n",
       "      <td>1</td>\n",
       "      <td>10990</td>\n",
       "      <td>2006-02-01</td>\n",
       "      <td>2007-07-05</td>\n",
       "      <td>0.68</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>46109</td>\n",
       "      <td>cash</td>\n",
       "      <td>12518</td>\n",
       "      <td>1</td>\n",
       "      <td>10596</td>\n",
       "      <td>2010-12-08</td>\n",
       "      <td>2013-05-05</td>\n",
       "      <td>1.24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>46109</td>\n",
       "      <td>credit</td>\n",
       "      <td>14049</td>\n",
       "      <td>1</td>\n",
       "      <td>11415</td>\n",
       "      <td>2010-07-07</td>\n",
       "      <td>2012-05-21</td>\n",
       "      <td>3.13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  client_id loan_type loan_amount repaid loan_id loan_start   loan_end  rate\n",
       "0     46109      home       13672      0   10243 2002-04-16 2003-12-20  2.15\n",
       "1     46109    credit        9794      0   10984 2003-10-21 2005-07-17  1.25\n",
       "2     46109      home       12734      1   10990 2006-02-01 2007-07-05  0.68\n",
       "3     46109      cash       12518      1   10596 2010-12-08 2013-05-05  1.24\n",
       "4     46109    credit       14049      1   11415 2010-07-07 2012-05-21  3.13"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "5CEOao2oAbco"
   },
   "outputs": [],
   "source": [
    "payments = pd.DataFrame(columns = ['loan_id', 'payment_amount', \n",
    "                                    'payment_date', 'missed'])\n",
    "\n",
    "for _, row in loans.iterrows():\n",
    "  time_created = row['loan_start']\n",
    "  payment_date = time_created + pd.Timedelta(days = 30)\n",
    "  loan_amount = row['loan_amount']\n",
    "  loan_id = row['loan_id']\n",
    "  payment_id = np.random.randint(10000, 12000, size = 1)[0]\n",
    "  for _ in range(np.random.randint(5, 10, size = 1)[0]):\n",
    "    payment_id += 1\n",
    "    payment_date += pd.Timedelta(days = np.random.randint(10, 50, size = 1)[0])\n",
    "    payments = payments.append(pd.DataFrame({'loan_id': loan_id, \n",
    "                                                               'payment_amount': np.random.randint(int(loan_amount / 10), int(loan_amount / 5), size = 1)[0],\n",
    "                                                               'payment_date': payment_date, 'missed': random.choice([0, 1])}, index = [0]), ignore_index = True)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "6rhMZtlzDKUg"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>loan_id</th>\n",
       "      <th>payment_amount</th>\n",
       "      <th>payment_date</th>\n",
       "      <th>missed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10243</td>\n",
       "      <td>2369</td>\n",
       "      <td>2002-05-31</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10243</td>\n",
       "      <td>2439</td>\n",
       "      <td>2002-06-18</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10243</td>\n",
       "      <td>2662</td>\n",
       "      <td>2002-06-29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10243</td>\n",
       "      <td>2268</td>\n",
       "      <td>2002-07-20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10243</td>\n",
       "      <td>2027</td>\n",
       "      <td>2002-07-31</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  loan_id payment_amount payment_date missed\n",
       "0   10243           2369   2002-05-31      1\n",
       "1   10243           2439   2002-06-18      1\n",
       "2   10243           2662   2002-06-29      0\n",
       "3   10243           2268   2002-07-20      0\n",
       "4   10243           2027   2002-07-31      1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "payments.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "autoexec": {
      "startup": false,
      "wait_interval": 0
     }
    },
    "colab_type": "code",
    "id": "VQcf7a61DODf"
   },
   "outputs": [],
   "source": [
    "clients = clients.drop_duplicates(subset = 'client_id')\n",
    "loans = loans.drop_duplicates(subset = 'loan_id')\n",
    "\n",
    "\n",
    "clients.to_csv('clients.csv', index = False)\n",
    "loans.to_csv('loans.csv', index = False)\n",
    "payments.to_csv('payments.csv', index = False)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "default_view": {},
   "name": "Feature Engineering Data Creation.ipynb",
   "private_outputs": true,
   "provenance": [],
   "version": "0.3.2",
   "views": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
